In [37]:
# Imports for pandas, and numpy
import numpy as np
import pandas as pd
# imports for seaborn and matplotlib to allow graphing
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline
# Load the Titanic training CSV (NOTE: adjust the file path as necessary) and
# drop the columns that are not necessary for the statistical analysis.
dTitTrain_DF = pd.read_csv('train.csv').drop(["Name", "Ticket"], axis=1)
In [38]:
# Print the per-column dtype / non-null summary, then show summary statistics
# (describe() is the last expression, so its table is the cell's output).
dTitTrain_DF.info()
dTitTrain_DF.describe()
Out[38]:
In [39]:
# Passengers with a recorded age: rows whose Age is missing are dropped.
titAge = dTitTrain_DF.dropna(subset=['Age'])
In [40]:
# Distribution by gender (all passengers, whatever their age)
ACmenData = dTitTrain_DF[dTitTrain_DF.Sex == 'male']
ACwomenData = dTitTrain_DF[dTitTrain_DF.Sex == 'female']
ACmenDataCount = float(ACmenData['Sex'].count())
ACwomenDataCount = float(ACwomenData['Sex'].count())
# Gender specific DFs restricted to adults (Age >= 21).
# Both conditions are combined into ONE boolean mask: filtering twice in a row
# (df[mask_a][mask_b]) applies a full-length mask to an already filtered frame,
# which pandas has to re-align and reports with a UserWarning.
# Passengers with a missing Age fail the >= comparison and are left out.
AmenData = dTitTrain_DF[(dTitTrain_DF.Sex == 'male') & (dTitTrain_DF.Age >= 21)]
AwomenData = dTitTrain_DF[(dTitTrain_DF.Sex == 'female') & (dTitTrain_DF.Age >= 21)]
AmenDataCount = float(AmenData['Sex'].count())
AwomenDataCount = float(AwomenData['Sex'].count())
In [41]:
# Age Specific Groups: split the passengers of known age at 21 years
# (21 and over = adult, under 21 = child) and keep each group's size as a float.
adultData = titAge.loc[titAge['Age'] >= 21]
childData = titAge.loc[titAge['Age'] < 21]
adultDataCount = float(adultData.Age.count())
childDataCount = float(childData.Age.count())
In [42]:
# Pclass: one sub-frame per ticket class (1, 2 and 3)
titClass1, titClass2, titClass3 = (
    dTitTrain_DF[dTitTrain_DF.Pclass == classNum] for classNum in (1, 2, 3)
)
In [43]:
# Alone or Family: a passenger with at least one sibling/spouse (SibSp) or
# parent/child (Parch) aboard is labelled 'Family', everyone else 'Alone'.
# The column is built in a single assignment on the frame itself; writing through
# df['col'].loc[mask] = ... is chained assignment, which triggers
# SettingWithCopyWarning and does not update the frame under copy-on-write.
# Rows where SibSp + Parch is not greater than 0 (including a missing sum) get 'Alone'.
dTitTrain_DF['SoloOrFamily'] = np.where(
    (dTitTrain_DF.SibSp + dTitTrain_DF.Parch) > 0, 'Family', 'Alone'
)
In [44]:
# Survivor column: readable Yes/No label derived from the 1/0 Survived flag
dTitTrain_DF['Survivor'] = dTitTrain_DF['Survived'].map({1: 'Yes', 0: 'No'})
In [45]:
# Passengers with a recorded cabin: rows whose Cabin is missing are dropped.
titCabin = dTitTrain_DF.dropna(subset=['Cabin'])
In [46]:
# Locational Data Groups
# Cabin strings of the passengers that have one; the helpers below read the
# first character of each string as the deck letter.
titDecks = titCabin['Cabin']
def deckGrab(tDK, cabLetter):
    """Return the cabins located on one deck.

    Parameters:
        tDK: iterable of cabin strings (their first character is the deck letter).
        cabLetter: deck letter to keep, e.g. "A".

    Returns:
        A one-column DataFrame ('Cabin') holding the deck letter of every entry
        of ``tDK`` that matches ``cabLetter``. The index is the position of the
        entry within ``tDK``. An empty ``tDK`` yields an empty frame that still
        has the 'Cabin' column.
    """
    deckLevels = [level[0] for level in tDK]
    # Naming the column at construction keeps empty input working; assigning
    # .columns afterwards fails on a frame that was built without any column.
    TDF = pd.DataFrame(deckLevels, columns=['Cabin'])
    return TDF[TDF.Cabin == cabLetter]
def deckCount(tDK, cabLetter):
    """Count the entries of ``tDK`` whose deck letter equals ``cabLetter``."""
    onDeck = deckGrab(tDK, cabLetter)
    # deckGrab already returns only the rows for cabLetter, so count them directly.
    return onDeck.count()['Cabin']
# print(deckCount(titDecks, "A"))
# print(deckCount(titDecks, "B"))
# print(deckCount(titDecks, "C"))
# print(deckCount(titDecks, "D"))
# print(deckCount(titDecks, "E"))
# print(deckCount(titDecks, "F"))
# print(deckCount(titDecks, "G"))
In [47]:
# embarked: one sub-frame per port code (C = Cherbourg, Q = Queenstown, S = Southampton)
titCherbourg, titQueenstown, titSouthampton = (
    dTitTrain_DF[dTitTrain_DF.Embarked == portCode] for portCode in ('C', 'Q', 'S')
)
In [48]:
# Headline passenger counts by gender.
# The total is the number of rows in the frame: counting the Age column only
# counts passengers whose age is recorded, so it disagrees with men + women.
printG = "Men account for " + str(ACmenDataCount) + " and " + "Women account for " + str(ACwomenDataCount) + " (Total Passengers: " + str(len(dTitTrain_DF)) + ")"
print(printG)
In [49]:
# Count plot: number of passengers per Sex.
gSSC = sns.factorplot('Sex', data=dTitTrain_DF, kind='count')
# Remove the left axis spine and label the y-axis.
gSSC.despine(left=True)
gSSC.set_ylabels("count of passengers")
Out[49]:
In [50]:
# Count plot: passengers per ticket class (1, 2, 3), split by Sex via hue.
gGCSC= sns.factorplot('Pclass',order=[1,2,3], data=dTitTrain_DF, hue='Sex', kind='count')
gGCSC.despine(left=True)
gGCSC.set_ylabels("count of passengers")
Out[50]:
In [51]:
# Youngest, oldest and mean age among the passengers whose age is recorded,
# reported as three lines of text.
printA = "\n".join([
    "Youngest Passenger in the passenger list was " + str(titAge['Age'].min()) + " years of age.",
    "Oldest Passenger in the passenger list was " + str(titAge['Age'].max()) + " years of age.",
    "Mean of Passengers ages in the passenger list is " + str(titAge['Age'].mean()) + " years of age.",
])
print(printA)
In [52]:
# Histogram of the recorded passenger ages, using 80 bins.
titAge['Age'].hist(bins=80)
Out[52]:
In [53]:
# Age density per ticket class: one shaded KDE curve per Pclass (hue) on a wide facet.
gCPS = sns.FacetGrid(titAge,hue='Pclass', aspect=4, hue_order=[1,2,3])
gCPS.map(sns.kdeplot,'Age', shade=True)
# Limit the x-axis to the range from 0 up to the oldest recorded age.
gCPS.set(xlim=(0,titAge['Age'].max()))
gCPS.add_legend()
Out[53]:
Reference:
Source: http://history.stackexchange.com/questions/17481/what-was-the-age-of-majority-in-1900-united-states
By the common law the age of majority is fixed at twenty-one years for both sexes, and, in the absence of any statute to the contrary, every person under that age, whether male or female, is an infant. (21)
-- The American and English Encyclopedia of Law, Garland and McGeehee, 1900
By the common law, every person is, technically, an infant, until he is twenty-one years old; and, in legal presumption, is not of sufficient discretion to contract an obligation at an earlier age.
-- Institutes of the Lawes of England by Coke (1628-1644). The laws on infants are at 171b.
In [54]:
# splits passengers into 3 categories ('child' when younger than 21, otherwise the passenger's sex)
def minorOrAdult(passenger):
    """Classify one passenger given as an (age, sex) pair.

    Returns 'child' when age is below 21; otherwise returns sex unchanged.
    A missing age (NaN) does not compare as below 21, so such passengers
    are labelled by their sex.
    """
    age, sex = passenger
    return 'child' if age < 21 else sex
# adds new column to dataframe that distinguishes a passenger as a child or an adult
# (minorOrAdult is applied row by row to the Age/Sex pair; adults keep their Sex value)
dTitTrain_DF['PersonStatus'] = dTitTrain_DF[['Age', 'Sex']].apply(minorOrAdult, axis=1)
In [55]:
# Number of passengers in each PersonStatus category.
dTitTrain_DF['PersonStatus'].value_counts()
Out[55]:
In [56]:
# Age density per PersonStatus: one shaded KDE curve each for child, male and female.
gACPS = sns.FacetGrid(dTitTrain_DF, hue='PersonStatus', aspect=4, hue_order=['child', 'male', 'female'])
gACPS.map(sns.kdeplot,'Age', shade=True)
# Limit the x-axis to the range from 0 up to the oldest recorded age.
gACPS.set(xlim=(0,titAge['Age'].max()))
gACPS.add_legend()
Out[56]:
In [57]:
# Count plot: passengers per ticket class, split by PersonStatus (child / male / female).
gGAC= sns.factorplot('Pclass', order=[1,2,3], data=dTitTrain_DF, hue='PersonStatus', kind='count',hue_order=['child','male','female'])
gGAC.despine(left=True)
gGAC.set_ylabels("count of passengers")
Out[57]:
In [58]:
# Count plot of passengers travelling alone vs. with family, followed by the
# exact number of passengers in each group.
sns.factorplot('SoloOrFamily', data=dTitTrain_DF, kind='count')
for travelLabel in ("Alone", "Family"):
    travelRows = dTitTrain_DF[dTitTrain_DF.SoloOrFamily == travelLabel]
    print(travelLabel + ": " + str(travelRows.count()['SoloOrFamily']))
In [59]:
def prepareDeckGraph(titDecksDF):
    """Build the frame used for the deck count plot.

    Parameters:
        titDecksDF: iterable of cabin strings (their first character is the deck letter).

    Returns:
        A one-column DataFrame ('Cabin') with one deck letter per entry, with
        the entries on deck 'T' removed. The index is the position of the entry
        within the input. Empty input yields an empty frame that still has the
        'Cabin' column.
    """
    deckLevels = [level[0] for level in titDecksDF]
    # Naming the column at construction keeps empty input working; assigning
    # .columns afterwards fails on a frame that was built without any column.
    T_DF = pd.DataFrame(deckLevels, columns=['Cabin'])
    return T_DF[T_DF.Cabin != 'T']
# Count plot of cabins per deck (A-G), followed by the exact count for each deck.
gTD_DF = prepareDeckGraph(titDecks)
sns.factorplot('Cabin', order=['A','B','C','D','E','F','G'], data=gTD_DF, kind='count')
for deckLetter in "ABCDEFG":
    print(deckLetter + ": " + str(deckCount(titDecks, deckLetter)))
In [60]:
# Count plot of passengers per port of embarkation, split by ticket class,
# followed by the totals per port and the per-class breakdown for each port.
sns.factorplot('Embarked', order=['C','Q','S'], data=dTitTrain_DF, hue='Pclass', kind='count', hue_order=[1,2,3])
embarkedGroups = (
    ("Cherbourg", titCherbourg),
    ("Queenstown", titQueenstown),
    ("Southampton", titSouthampton),
)
print("Total:")
for portName, portDF in embarkedGroups:
    print(portName + ": " + str(portDF.count()['Embarked']))
for portName, portDF in embarkedGroups:
    print("")
    print(portName + ": ")
    for portClass in (1, 2, 3):
        print("Pclass " + str(portClass) + " - " + str(portDF[portDF.Pclass == portClass].count()['Embarked']))
In [61]:
# Survivors Overall: count plot of the Survivor column plus the exact counts
gSOA = sns.factorplot('Survivor', data=dTitTrain_DF, kind='count')
gSOA.despine(left=True)
gSOA.set_ylabels("count of passengers")
for outcomeLabel, outcomeValue in (("Survivor", "Yes"), ("Non-Survivor", "No")):
    print(outcomeLabel + ": " + str(dTitTrain_DF[dTitTrain_DF.Survivor == outcomeValue].count()['Survivor']))
In [62]:
# Series probability - probability of survival (mean of the 0/1 Survived flag)
# for men and for women.
menProb = ACmenData.groupby('Sex').Survived.mean()
womenProb = ACwomenData.groupby('Sex').Survived.mean()
# Each grouped Series holds a single entry labelled by the sex, so it is read
# by position with .iloc; plain [0] on a label-indexed Series is deprecated
# positional indexing.
menPercent = menProb.iloc[0]*100
womenPercent = womenProb.iloc[0]*100
print("Men Survivalbility: ")
print(menProb.iloc[0])
print("Women Survivalbility: ")
print(womenProb.iloc[0])
# Bar plot of the mean of Survived per Sex.
gSSP = sns.factorplot("Sex", "Survived", data=dTitTrain_DF, kind="bar", size=5)
gSSP.despine(left=True)
gSSP.set_ylabels("survival probability")
Out[62]:
In [63]:
# Determines the probability of survival for a given Pclass
def define_pClassProb(dataFrameIN, numClass):
    """Return the fraction of passengers in ticket class ``numClass`` that survived.

    Parameters:
        dataFrameIN: DataFrame with 'Pclass' and 'Survived' columns.
        numClass: ticket class to evaluate.

    Returns:
        Number of class rows with Survived == 1 divided by the number of class
        rows, as a float (NaN when the class has no rows).
    """
    inClass = dataFrameIN[dataFrameIN.Pclass == numClass]
    # The mean of a boolean column is the share of True values.
    return (inClass.Survived == 1).mean()
# Print the survival probability of each ticket class, then draw it as a bar plot.
for classNum in (1, 2, 3):
    print("Class " + str(classNum) + " Survivality: ")
    print(define_pClassProb(dTitTrain_DF, classNum))
gCS = sns.factorplot("Pclass", "Survived",order=[1,2,3],data=dTitTrain_DF, kind="bar", size=5)
gCS.despine(left=True)
gCS.set_ylabels("survival probability")
Out[63]:
In [64]:
# Same per-class survival probabilities, this time drawn as a point plot.
for classNum in (1, 2, 3):
    print("Class " + str(classNum) + " Survivality: ")
    print(define_pClassProb(dTitTrain_DF, classNum))
sns.factorplot("Pclass", "Survived",order=[1,2,3], data=dTitTrain_DF, kind='point')
Out[64]:
In [65]:
# determines the probability of survival for genders in a given Pclass
def define_pClassProbSex(dataFrameIN, numClass, sex):
    """Return the fraction of passengers of one sex in one ticket class that survived.

    Parameters:
        dataFrameIN: DataFrame with 'Pclass', 'Sex' and 'Survived' columns.
        numClass: ticket class to evaluate.
        sex: value of the 'Sex' column to evaluate.

    Returns:
        Survivors (Survived == 1) divided by all passengers in the selection.
    """
    # Both conditions go into ONE mask. Filtering twice in a row applies a mask
    # built from the full frame to an already filtered frame, which pandas has
    # to re-align and reports with a UserWarning.
    classEntries = dataFrameIN[(dataFrameIN.Pclass == numClass) & (dataFrameIN.Sex == sex)]
    sClassEntries = classEntries[classEntries.Survived == 1]
    cClassEntries = (classEntries.count(numeric_only=True)['Pclass']).astype(float)
    cSClassEntries = (sClassEntries.count(numeric_only=True)['Pclass']).astype(float)
    return (cSClassEntries/cClassEntries)
# Print the survival probability for every class/sex combination, then draw
# the same breakdown as a bar plot.
for classNum in (1, 2, 3):
    for sexName in ('male', 'female'):
        print("Class " + str(classNum) + " Survivality(" + sexName.upper() + "): ")
        print(define_pClassProbSex(dTitTrain_DF, classNum, sexName))
gGCSP = sns.factorplot("Pclass", "Survived",order=[1,2,3],data=dTitTrain_DF,hue='Sex', kind='bar')
gGCSP.despine(left=True)
gGCSP.set_ylabels("survival probability")
Out[65]:
In [66]:
# Point plot of the mean of Survived per ticket class, one line per Sex.
sns.factorplot("Pclass", "Survived", hue='Sex',order=[1,2,3], data=dTitTrain_DF, kind='point')
Out[66]:
In [90]:
#Determine probability of survival of children in a given Pclass
def define_pClassChildProb(dataFrameIN, numClass):
    """Return the fraction of children in ticket class ``numClass`` that survived.

    Parameters:
        dataFrameIN: DataFrame with 'PassengerId', 'Pclass', 'PersonStatus'
            and 'Survivor' columns.
        numClass: ticket class to evaluate.

    Returns:
        Children with Survivor == 'Yes' divided by all children in the class.
    """
    # One combined mask instead of filtering twice in a row: a mask built from
    # the full frame no longer lines up with an already filtered frame, so
    # pandas has to re-align it and reports that with a UserWarning.
    ChildDF = dataFrameIN[(dataFrameIN.Pclass == numClass) & (dataFrameIN.PersonStatus == 'child')]
    ChildSurvived = ChildDF[ChildDF.Survivor == 'Yes']
    totalCChild = ChildDF.count()['PassengerId'].astype(float)
    CChildSurvived = ChildSurvived.count()['PassengerId'].astype(float)
    return CChildSurvived/totalCChild
def define_pClassAdultProb(dataFrameIN, numClass, sex):
    """Return the fraction of passengers with one PersonStatus in one class that survived.

    Parameters:
        dataFrameIN: DataFrame with 'PassengerId', 'Pclass', 'PersonStatus'
            and 'Survivor' columns.
        numClass: ticket class to evaluate.
        sex: PersonStatus value to select (the callers pass 'male' or 'female').

    Returns:
        Selected passengers with Survivor == 'Yes' divided by all selected passengers.
    """
    # One combined mask instead of filtering twice in a row: a mask built from
    # the full frame no longer lines up with an already filtered frame, so
    # pandas has to re-align it and reports that with a UserWarning.
    AdultDF = dataFrameIN[(dataFrameIN.Pclass == numClass) & (dataFrameIN.PersonStatus == sex)]
    AdultSurvived = AdultDF[AdultDF.Survivor == 'Yes']
    totalCAdult = AdultDF.count()['PassengerId'].astype(float)
    CAdultSurvived = AdultSurvived.count()['PassengerId'].astype(float)
    return CAdultSurvived/totalCAdult
# Print the survival probability of children, adult women and adult men for
# each ticket class (classes separated by a dashed line), then draw the same
# breakdown as a point plot.
for classNum in (1, 2, 3):
    if classNum > 1:
        print("-----------")
    print("PClass " + str(classNum) + " Survival Child: ")
    print(define_pClassChildProb(dTitTrain_DF, classNum))
    print("PClass " + str(classNum) + " Survival Female: ")
    print(define_pClassAdultProb(dTitTrain_DF, classNum, 'female'))
    print("PClass " + str(classNum) + " Survival Male: ")
    print(define_pClassAdultProb(dTitTrain_DF, classNum, 'male'))
sns.factorplot("Pclass", "Survived", hue='PersonStatus',order=[1,2,3], data=dTitTrain_DF, kind='point')
Out[90]:
In [68]:
#sns.lmplot('Age', 'Survived', data=dTitTrain_DF)
# Box plot of Age for each value of Survived.
pSBA = sns.boxplot(data=dTitTrain_DF, x='Survived', y='Age')
# Title, axis labels and tick labels for the plot.
# NOTE(review): the tick labels assume the boxes are drawn in the order 0, 1 - confirm.
pSBA.set(title='Age Distribution by Survival',
xlabel = 'Survival',
ylabel = 'Age Distrobution',
xticklabels = ['Died', 'Survived'])
Out[68]:
In [69]:
# Split the passengers by the SoloOrFamily column created earlier: one frame
# for those travelling with family, one for those travelling alone.
familyPass = dTitTrain_DF.loc[dTitTrain_DF.SoloOrFamily == "Family"]
alonePass = dTitTrain_DF.loc[dTitTrain_DF.SoloOrFamily == "Alone"]
# Survivors within each of the two groups
AFamilyPass = familyPass.loc[familyPass['Survivor'] == "Yes"]
AAlonePass = alonePass.loc[alonePass['Survivor'] == "Yes"]
# Probability of survival = survivors in the group / passengers in the group
pAF = float(AFamilyPass.SoloOrFamily.count()) / float(familyPass.SoloOrFamily.count())
pAA = float(AAlonePass.SoloOrFamily.count()) / float(alonePass.SoloOrFamily.count())
print("Probability of Survival being with Family: ")
print(pAF)
print("")
print("Probability of Survival being alone: ")
print(pAA)
# Bar plot of the mean of Survived for each SoloOrFamily group
gSSP = sns.factorplot("SoloOrFamily", "Survived", data=dTitTrain_DF, kind="bar", size=5)
gSSP.despine(left=True)
gSSP.set_ylabels("survival probability")
Out[69]:
In [70]:
#sns.lmplot('Age', 'Survived',hue='Pclass', data=dTitanic_DF, hue_order=[1,2,3])
# Box plot of Age per ticket class, split by the Survivor label; rows with a
# missing Age are dropped and the rows are sorted by Pclass before plotting.
pACSB = sns.boxplot(data = dTitTrain_DF.dropna(subset = ['Age']).sort_values('Pclass'), x='Pclass', y='Age', hue='Survivor')
pACSB.set(title='Age by Class and Survival - Box Plot', xlabel='Pclass')
# Anchor the legend just outside the right edge of the axes.
pACSB.legend(bbox_to_anchor=(1.05, .7), loc=2, title = 'Survived',borderaxespad=0.)
Out[70]:
In [71]:
#sns.lmplot('Age', 'Survived', hue='Sex' ,data=dTitanic_DF)
# Box plot of Age per Sex, split by the Survivor label; rows with a missing
# Age are dropped before plotting.
pAGSB = sns.boxplot(data=dTitTrain_DF.dropna(subset = ['Age']), x= 'Sex', y= 'Age', hue='Survivor')
pAGSB.set(title='Age by Gender and Survival - Box Plot')
# Anchor the legend just outside the right edge of the axes.
pAGSB.legend(bbox_to_anchor=(1.05, .7), loc=2, title = 'Survived',borderaxespad=0.)
Out[71]:
In [103]:
# Determining better odds which will be compared to test group
# (First comparison - Pclass and age group)
#
# Rule-based model: every passenger in test.csv collects points for ticket
# class, sex, age group and family presence. The point total is divided by 40
# and the passenger is predicted to survive when that score is at least 0.5.
import csv
import sys


def survivalWeight(row):
    """Return the point total for one data row of test.csv.

    ``row`` is the list of strings produced by csv.reader. The column
    positions are the ones this cell has always read:
    1 = ticket class, 3 = sex, 4 = age, 5 and 6 = the two family counts
    (sibsp and parch).
    """
    # csv.reader yields every field as text, so the numeric fields are
    # converted before they are compared with numbers.
    pclass = int(row[1])
    if pclass == 1:
        weight = 9.0
    elif pclass == 2:
        weight = 8.0
    else:
        weight = 5.0
    weight += 8 if row[3] == 'female' else 2
    # A blank age field means the age is unknown; those passengers are scored
    # as adults, the same way minorOrAdult treats a missing age.
    isChild = row[4] != '' and float(row[4]) < 21
    weight += 6 if isChild else 5
    # Family aboard: the two counts are added as numbers, not joined as text.
    aFam = int(row[5]) + int(row[6])
    weight += 5 if aFam > 0 else 3
    return weight


# The csv module needs binary file handles on Python 2 and text handles opened
# with newline='' on Python 3.
if sys.version_info[0] < 3:
    csvReadArgs, csvWriteArgs = {'mode': 'rb'}, {'mode': 'wb'}
else:
    csvReadArgs = {'mode': 'r', 'newline': ''}
    csvWriteArgs = {'mode': 'w', 'newline': ''}

# The with-blocks close both files even when a row fails to parse.
with open('test.csv', **csvReadArgs) as test_file:
    with open("genderPclassbasedmodel.csv", **csvWriteArgs) as prediction_file:
        test_file_object = csv.reader(test_file)
        header = next(test_file_object)  # skip the column-name row
        prediction_file_object = csv.writer(prediction_file)
        prediction_file_object.writerow(["PassengerId", "Survived"])
        for row in test_file_object: # For each row in test.csv
            weight = survivalWeight(row)
            weightScore = weight/40.0
            print(str(weightScore))
            # Decide on the normalised score; the raw point total is always
            # far above 0.5 and would mark every passenger as a survivor.
            if weightScore >= .5:
                prediction_file_object.writerow([row[0],'1'])
            else:
                prediction_file_object.writerow([row[0],'0'])
In [ ]: